library(tidyverse)
library(ggthemes)
library(knitr)
library(broom)
library(stringr)
options(digits = 3)
set.seed(1234)
The goal is to maximize the data-ink ratio:
\[\text{Data-ink ratio} = \frac{\text{data-ink}}{\text{total ink used to print the graphic}}\]
Tufte never offers proof of his hypothesis that less is better.
# Baseline scatterplot of city vs. highway mileage from `mpg`; every variant
# below starts from this object and only changes the theme.
p <- ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_point()
p

# What happens if we strip away everything except the data?
p + theme_void()

# Hmm, so what do we actually need to keep? What should we consider
# “integral”? What if we remove the background color?
p + theme_bw()

# Blank out the panel background and the panel border.
p +
  theme_bw() +
  theme(
    panel.background = element_blank(),
    panel.border = element_blank()
  )

# Additionally blank the strip/plot backgrounds, the axis lines and the minor
# grid lines.
p +
  theme_bw() +
  theme(
    panel.background = element_blank(),
    panel.border = element_blank(),
    strip.background = element_blank(),
    plot.background = element_blank(),
    axis.line = element_blank(),
    panel.grid.minor = element_blank()
  )

# Same as above, but blank the whole panel grid rather than only the minor
# lines.
p +
  theme_bw() +
  theme(
    panel.background = element_blank(),
    panel.border = element_blank(),
    strip.background = element_blank(),
    plot.background = element_blank(),
    axis.line = element_blank(),
    panel.grid = element_blank()
  )

# Also blank the axis ticks.
p +
  theme_bw() +
  theme(
    panel.background = element_blank(),
    panel.border = element_blank(),
    strip.background = element_blank(),
    plot.background = element_blank(),
    axis.line = element_blank(),
    panel.grid = element_blank(),
    axis.ticks = element_blank()
  )

# Same elements blanked, with the base font family switched to serif.
p +
  theme_bw(base_family = "serif") +
  theme(
    panel.background = element_blank(),
    panel.border = element_blank(),
    strip.background = element_blank(),
    plot.background = element_blank(),
    axis.line = element_blank(),
    panel.grid = element_blank(),
    axis.ticks = element_blank()
  )
# What have we lost? Is this easier to interpret? Harder?
ggplot2 - no support for them
ggplot
The duck
Tufte concludes that forgoing chartjunk enables functionality and insight (as Cairo would describe it). Do you agree?
ggplot2
The goal of Tufte’s minimalism is to maximize the data-ink ratio, so we want to modify the traditional or default graphs in R and ggplot2 to minimize the use of extraneous ink.
# Example data for the line chart: one value per year, 1967-1977.
x <- 1967:1977
y <- c(0.5, 1.8, 4.6, 5.3, 5.3, 5.7, 5.4, 5, 5.5, 6, 5)
# tibble() replaces the deprecated data_frame(); the columns are still named
# `x` and `y`.
d <- tibble(x, y)

# Default ggplot2 rendering. The y breaks 1..6 are relabelled as the dollar
# amounts $300..$400. `labels` is spelled out in full instead of relying on
# partial matching of `label`.
ggplot(d, aes(x, y)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(
    breaks = seq(1, 6, 1),
    labels = sprintf("$%s", seq(300, 400, 20))
  ) +
  labs(
    title = "Per capita budget expenditures",
    x = "Year",
    y = "Per capita budget expenditures\nin constant dollars"
  )

# geom_point() to draw the data points and geom_line() to connect the points
# Tufte-style rendering: axis titles are removed and the description is
# written inside the plot area with annotate() instead.
ggplot(d, aes(x, y)) +
  geom_line() +
  geom_point(size = 3) +
  theme_tufte(base_size = 15) +
  theme(axis.title = element_blank()) +
  geom_hline(yintercept = c(5, 6), lty = 2) +
  scale_y_continuous(
    breaks = seq(1, 6, 1),
    labels = sprintf("$%s", seq(300, 400, 20))
  ) +
  scale_x_continuous(breaks = x, labels = x) +
  annotate(
    "text",
    x = c(1977, 1977.2),
    y = c(1.5, 5.5),
    adj = 1,
    family = "serif",
    label = c("Per capita\nbudget expenditures\nin constant dollars", "5%")
  )

# Default boxplots of reporting stations per magnitude value in `quakes`.
ggplot(quakes, aes(factor(mag), stations)) +
  geom_boxplot() +
  labs(
    title = "Fiji earthquakes",
    x = "Richter Magnitude",
    y = "Number of stations reporting earthquakes"
  )

# Tufte boxplot with the outlier colour set to transparent.
ggplot(quakes, aes(factor(mag), stations)) +
  theme_tufte() +
  geom_tufteboxplot(outlier.colour = "transparent") +
  theme(axis.title = element_blank()) +
  annotate(
    "text",
    x = 8,
    y = 120,
    adj = 1,
    family = "serif",
    label = "Number of stations \nreporting Richter Magnitude\nof Fiji earthquakes (n=1000)"
  )

# Tufte boxplot variant using median.type = "line".
ggplot(quakes, aes(factor(mag), stations)) +
  theme_tufte() +
  geom_tufteboxplot(median.type = "line") +
  theme(axis.title = element_blank()) +
  annotate(
    "text",
    x = 8,
    y = 120,
    adj = 1,
    family = "serif",
    label = "Number of stations \nreporting Richter Magnitude\nof Fiji earthquakes (n=1000)"
  )
library(psych)
library(reshape2)
# Column means of nine `msq` items (selected by position, NAs ignored), scaled
# by 10 and melted into a data frame with a `value` column; the row names are
# copied into `trait` for use on the x axis.
d <- melt(
  colMeans(msq[, c(2, 7, 34, 36, 42, 43, 46, 55, 68)], na.rm = TRUE) * 10
)
d$trait <- rownames(d)

# Default ggplot2 bar chart.
# NOTE(review): the title previously read "1998" while the annotation on the
# Tufte version below cites 1988; aligned to 1988 here — confirm the citation.
p <- ggplot(d, aes(x = trait, y = value)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = seq(1, 5, 1)) +
  labs(
    title = "Watson et al., 1988",
    subtitle = "N = 3896",
    x = "Negative emotion traits",
    y = "Average score"
  )
p

# ggplot2 offers other default backgrounds
p + theme_bw()
p + theme_dark()
p + theme_classic()
p + theme_minimal()
p + theme_void()

# Tufte-style bar chart: thin gray bars, no axis titles or ticks, and white
# horizontal lines drawn over the bars at each y break.
ggplot(d, aes(x = trait, y = value)) +
  theme_tufte(base_size = 14, ticks = FALSE) +
  geom_bar(width = 0.25, fill = "gray", stat = "identity") +
  theme(axis.title = element_blank()) +
  scale_y_continuous(breaks = seq(1, 5, 1)) +
  geom_hline(
    yintercept = seq(1, 5, 1),
    col = "white",
    lwd = 1
  ) +
  annotate(
    "text",
    x = 3.5,
    y = 5,
    adj = 1,
    family = "serif",
    label = "Average scores\non negative emotion traits\nfrom 3896 participants\n(Watson et al., 1988)"
  )

# Default scatterplot of car weight against fuel economy.
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  xlab("Car weight (lb/1000)") +
  ylab("Miles per gallon of fuel")

# Same scatterplot with geom_rangeframe() and theme_tufte(); the axis titles
# are nudged with vjust.
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  geom_rangeframe() +
  theme_tufte() +
  xlab("Car weight (lb/1000)") +
  ylab("Miles per gallon of fuel") +
  theme(
    axis.title.x = element_text(vjust = -0.5),
    axis.title.y = element_text(vjust = 1.5)
  )
library(devtools)
# NOTE(security): this evaluates code downloaded from GitHub at run time, from
# a branch that can change. Passing the SHA-1 that was reported when this
# example was originally run makes source_url() refuse to evaluate the file if
# its contents no longer match.
source_url(
  "https://raw.githubusercontent.com/bearloga/Quartile-frame-Scatterplot/master/qfplot.R",
  sha1 = "fe88d63ea7111be1a61ea5d36df1bb9c196fba73"
)
## SHA-1 hash of file is fe88d63ea7111be1a61ea5d36df1bb9c196fba73

# Scatterplot of car weight against fuel economy drawn with the sourced
# qfplot() function.
qfplot(
  x = mtcars$wt,
  y = mtcars$mpg,
  xlab = "Car weight (lb/1000)",
  ylab = "Miles per gallon of fuel"
)
# Traditional bar chart of crime in the city of San Francisco, 2009-10.
# Source: Visualizing Time with the Double-Time Bar Chart
Double-time bar chart of crime in the city of San Francisco, 2009-10. Source: Visualizing Time with the Double-Time Bar Chart
The double-time bar chart enables representing trends over a 24-hour period without breaking arbitrarily at midnight.
Chart from Harvard magazine. Source: Involuntary head-shaking is probably not an intended consequence of data visualization
Redesigned chart from Harvard magazine. Source: Involuntary head-shaking is probably not an intended consequence of data visualization
Source: Figures 4-6 from Bateman, Scott, et al. “Useful junk?: the effects of visual embellishment on comprehension and memorability of charts.” Proceedings of the SIGCHI Conference on Human Factors in Computing Systems. ACM, 2010.
library(twitteR)##
## Attaching package: 'twitteR'
## The following objects are masked from 'package:dplyr':
##
## id, location
library(tidytext)
# setup API authentication
# The two credentials are read from the R options "twitter_api_key" and
# "twitter_api_token" rather than being written into the script.
# NOTE(review): they are passed positionally to setup_twitter_oauth(); confirm
# they correspond to the first two arguments that function expects.
setup_twitter_oauth(getOption("twitter_api_key"),
getOption("twitter_api_token"))## [1] "Using browser based authentication"
# Request 3200 tweets from the @realDonaldTrump timeline and convert the
# returned list to a tibble.
trump <- as_tibble(twListToDF(userTimeline("realDonaldTrump", n = 3200)))

# Custom tokenizing pattern for tweets: matches any character that is not a
# letter, digit, `#`, `@` or apostrophe, and any apostrophe that is not
# followed by one of those characters.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
# tokenize
# Drop tweets whose text starts with a double quote, strip t.co links and the
# HTML-escaped ampersand, split the text into words with `reg`, then remove
# stop words and tokens that contain no lowercase letter.
#
# The pattern targets the literal entity `&amp;`: removing only `&` would
# leave an `amp;` fragment behind, which `reg` then splits into the word "amp".
trump_token <- trump %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(
    text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")
  ) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(
    !word %in% stop_words$word,
    str_detect(word, "[a-z]")
  )
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

# Colours for the cloud: eight colours from the "Set1" Brewer palette.
pal <- brewer.pal(n = 8, name = "Set1")

# Word cloud of token counts from the Trump tweets, limited to at most 100
# words.
trump_token %>%
  count(word) %>%
  with(
    wordcloud(
      words = word,
      freq = n,
      max.words = 100,
      scale = c(2, .5),
      colors = pal
    )
  )
# get tweets
# Request 3200 tweets from the @Pontifex timeline and convert the returned
# list to a tibble.
pope <- as_tibble(twListToDF(userTimeline("Pontifex", n = 3200)))
# tokenize
# Drop tweets whose text starts with a double quote, strip t.co links and the
# HTML-escaped ampersand, split the text into words with `reg`, then remove
# stop words and tokens that contain no lowercase letter.
#
# The pattern targets the literal entity `&amp;`: removing only `&` would
# leave an `amp;` fragment behind, which `reg` then splits into the word "amp".
pope_token <- pope %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(
    text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")
  ) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(
    !word %in% stop_words$word,
    str_detect(word, "[a-z]")
  )
library(reshape2)
# Stack both token tables with a `person` label, count each word per person,
# cast the counts to a word-by-person matrix (missing combinations filled with
# 0) and draw a comparison cloud of at most 100 words.
bind_rows(Trump = trump_token, Pope = pope_token, .id = "person") %>%
  count(word, person) %>%
  acast(
    formula = word ~ person,
    value.var = "n",
    fill = 0
  ) %>%
  comparison.cloud(
    max.words = 100,
    colors = c("blue", "red")
  )
# The size of a word’s text is in proportion to its frequency within its
# category (i.e. proportion of all Trump tweets or all pope tweets). We can use
# this visualization to see the most frequent words/hashtags by President Trump
# and Pope Francis, but the sizes of the words are not comparable across the
# two accounts.
# Same per-person word counts, spread into one column per person (0 where a
# word is absent) and plotted as text labels, Pope on x and Trump on y. The
# dashed line has slope 1.
bind_rows(Trump = trump_token, Pope = pope_token, .id = "person") %>%
  count(word, person) %>%
  spread(key = person, value = n, fill = 0) %>%
  ggplot(mapping = aes(x = Pope, y = Trump, label = word)) +
  geom_text() +
  geom_abline(slope = 1, linetype = 2)
# Revised
What are the advantages/disadvantages to the second plot?
devtools::session_info()## Session info -------------------------------------------------------------
## setting value
## version R version 3.4.3 (2017-11-30)
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz America/Chicago
## date 2018-03-08
## Packages -----------------------------------------------------------------
## package * version date source
## assertthat 0.2.0 2017-04-11 CRAN (R 3.4.0)
## backports 1.1.2 2017-12-13 CRAN (R 3.4.3)
## base * 3.4.3 2017-12-07 local
## bindr 0.1 2016-11-13 CRAN (R 3.4.0)
## bindrcpp 0.2 2017-06-17 CRAN (R 3.4.0)
## bit 1.1-12 2014-04-09 CRAN (R 3.4.0)
## bit64 0.9-7 2017-05-08 CRAN (R 3.4.0)
## broom * 0.4.3 2017-11-20 CRAN (R 3.4.1)
## cellranger 1.1.0 2016-07-27 CRAN (R 3.4.0)
## cli 1.0.0 2017-11-05 CRAN (R 3.4.2)
## colorspace 1.3-2 2016-12-14 CRAN (R 3.4.0)
## compiler 3.4.3 2017-12-07 local
## crayon 1.3.4 2017-10-03 Github (gaborcsardi/crayon@b5221ab)
## curl 3.1 2017-12-12 CRAN (R 3.4.3)
## datasets * 3.4.3 2017-12-07 local
## DBI 0.7 2017-06-18 CRAN (R 3.4.0)
## devtools 1.13.5 2018-02-18 CRAN (R 3.4.3)
## digest 0.6.15 2018-01-28 CRAN (R 3.4.3)
## dplyr * 0.7.4.9000 2017-10-03 Github (tidyverse/dplyr@1a0730a)
## evaluate 0.10.1 2017-06-24 CRAN (R 3.4.1)
## forcats * 0.3.0 2018-02-19 CRAN (R 3.4.3)
## foreign 0.8-69 2017-06-22 CRAN (R 3.4.3)
## ggplot2 * 2.2.1 2016-12-30 CRAN (R 3.4.0)
## ggthemes * 3.4.0 2017-02-19 CRAN (R 3.4.0)
## glue 1.2.0 2017-10-29 CRAN (R 3.4.2)
## graphics * 3.4.3 2017-12-07 local
## grDevices * 3.4.3 2017-12-07 local
## grid 3.4.3 2017-12-07 local
## gtable 0.2.0 2016-02-26 CRAN (R 3.4.0)
## haven 1.1.1 2018-01-18 CRAN (R 3.4.3)
## hms 0.4.1 2018-01-24 CRAN (R 3.4.3)
## htmltools 0.3.6 2017-04-28 CRAN (R 3.4.0)
## httr 1.3.1 2017-08-20 CRAN (R 3.4.1)
## janeaustenr 0.1.5 2017-06-10 CRAN (R 3.4.0)
## jsonlite 1.5 2017-06-01 CRAN (R 3.4.0)
## knitr * 1.20 2018-02-20 CRAN (R 3.4.3)
## lattice 0.20-35 2017-03-25 CRAN (R 3.4.3)
## lazyeval 0.2.1 2017-10-29 CRAN (R 3.4.2)
## lubridate 1.7.2 2018-02-06 CRAN (R 3.4.3)
## magrittr 1.5 2014-11-22 CRAN (R 3.4.0)
## Matrix 1.2-12 2017-11-20 CRAN (R 3.4.3)
## memoise 1.1.0 2017-04-21 CRAN (R 3.4.0)
## methods * 3.4.3 2017-12-07 local
## mnormt 1.5-5 2016-10-15 CRAN (R 3.4.0)
## modelr 0.1.1 2017-08-10 local
## munsell 0.4.3 2016-02-13 CRAN (R 3.4.0)
## nlme 3.1-131.1 2018-02-16 CRAN (R 3.4.3)
## openssl 1.0 2018-02-02 CRAN (R 3.4.3)
## parallel 3.4.3 2017-12-07 local
## pillar 1.1.0 2018-01-14 CRAN (R 3.4.3)
## pkgconfig 2.0.1 2017-03-21 CRAN (R 3.4.0)
## plyr 1.8.4 2016-06-08 CRAN (R 3.4.0)
## psych 1.7.8 2017-09-09 CRAN (R 3.4.1)
## purrr * 0.2.4 2017-10-18 CRAN (R 3.4.2)
## R6 2.2.2 2017-06-17 CRAN (R 3.4.0)
## Rcpp 0.12.15 2018-01-20 CRAN (R 3.4.3)
## readr * 1.1.1 2017-05-16 CRAN (R 3.4.0)
## readxl 1.0.0 2017-04-18 CRAN (R 3.4.0)
## reshape2 1.4.3 2017-12-11 CRAN (R 3.4.3)
## rjson 0.2.15 2014-11-03 CRAN (R 3.4.0)
## rlang 0.2.0 2018-02-20 cran (@0.2.0)
## rmarkdown 1.8 2017-11-17 CRAN (R 3.4.2)
## rprojroot 1.3-2 2018-01-03 CRAN (R 3.4.3)
## rstudioapi 0.7 2017-09-07 CRAN (R 3.4.1)
## rvest 0.3.2 2016-06-17 CRAN (R 3.4.0)
## scales 0.5.0 2017-08-24 cran (@0.5.0)
## SnowballC 0.5.1 2014-08-09 CRAN (R 3.4.0)
## stats * 3.4.3 2017-12-07 local
## stringi 1.1.6 2017-11-17 CRAN (R 3.4.2)
## stringr * 1.3.0 2018-02-19 CRAN (R 3.4.3)
## tibble * 1.4.2 2018-01-22 CRAN (R 3.4.3)
## tidyr * 0.8.0 2018-01-29 CRAN (R 3.4.3)
## tidytext * 0.1.7 2018-02-19 CRAN (R 3.4.3)
## tidyverse * 1.2.1 2017-11-14 CRAN (R 3.4.2)
## tokenizers 0.1.4 2016-08-29 CRAN (R 3.4.0)
## tools 3.4.3 2017-12-07 local
## twitteR * 1.1.9 2015-07-29 CRAN (R 3.4.0)
## utils * 3.4.3 2017-12-07 local
## withr 2.1.1 2017-12-19 CRAN (R 3.4.3)
## xml2 1.2.0 2018-01-24 CRAN (R 3.4.3)
## yaml 2.1.16 2017-12-12 CRAN (R 3.4.3)
Source for examples: Tufte in R↩